Catherine Heller, Jeremy Hess, Rudra Menon
Final Project CMSC320 2019
library(magrittr)
library(rvest)
## Loading required package: xml2
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
##
## extract
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble 2.0.1 ✔ purrr 0.3.0
## ✔ readr 1.3.1 ✔ stringr 1.3.1
## ✔ tibble 2.0.1 ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract() masks magrittr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::pluck() masks rvest::pluck()
## ✖ purrr::set_names() masks magrittr::set_names()
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Read the raw event export into a tibble; readr guesses the column types.
# The parsing-failure warning printed below is readr reporting that the data
# rows have fewer fields (42) than the header declares (52).
stats <- read_csv(file = "SpaceBastards-stats.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## `Date/Time` = col_datetime(format = ""),
## `Point Elapsed Seconds` = col_double(),
## `Our Score - End of Point` = col_double(),
## `Their Score - End of Point` = col_double(),
## `Hang Time (secs)` = col_double(),
## `Player 7` = col_logical(),
## `Player 8` = col_logical(),
## `Player 9` = col_logical(),
## `Player 10` = col_logical(),
## `Player 11` = col_logical(),
## `Player 12` = col_logical(),
## `Player 13` = col_logical(),
## `Player 14` = col_logical(),
## `Player 15` = col_logical(),
## `Player 16` = col_logical(),
## `Player 17` = col_logical(),
## `Player 18` = col_logical(),
## `Player 19` = col_logical(),
## `Player 20` = col_logical(),
## `Player 21` = col_logical()
## # ... with 7 more columns
## )
## See spec(...) for full column specifications.
## Warning: 4838 parsing failures.
## row col expected actual file
## 1 -- 52 columns 42 columns 'SpaceBastards-stats.csv'
## 2 -- 52 columns 42 columns 'SpaceBastards-stats.csv'
## 3 -- 52 columns 42 columns 'SpaceBastards-stats.csv'
## 4 -- 52 columns 42 columns 'SpaceBastards-stats.csv'
## 5 -- 52 columns 42 columns 'SpaceBastards-stats.csv'
## ... ... .......... .......... .........................
## See problems(...) for more details.
# Keep only the columns used in the analysis and give the frequently used ones
# short names. "Tournamemnt" is the spelling of the column in the data.
stats <- select(
  stats,
  "Date/Time",
  tournament = "Tournamemnt",
  opponent = "Opponent",
  time = "Point Elapsed Seconds",
  "Line",
  ourscore = "Our Score - End of Point",
  theirscore = "Their Score - End of Point",
  "Event Type",
  "Action",
  "Passer",
  rec = "Receiver",
  "Defender",
  p0 = "Player 0",
  p1 = "Player 1",
  p2 = "Player 2",
  p3 = "Player 3",
  p4 = "Player 4",
  p5 = "Player 5",
  p6 = "Player 6"
)
stats
## # A tibble: 4,827 x 19
## `Date/Time` tournament opponent time Line ourscore theirscore
## <dttm> <chr> <chr> <dbl> <chr> <dbl> <dbl>
## 1 2019-04-01 00:09:00 Easterns Northea… 229 O 1 0
## 2 2019-04-01 00:09:00 Easterns Northea… 229 O 1 0
## 3 2019-04-01 00:09:00 Easterns Northea… 229 O 1 0
## 4 2019-04-01 00:09:00 Easterns Northea… 229 O 1 0
## 5 2019-04-01 00:09:00 Easterns Northea… 229 O 1 0
## 6 2019-04-01 00:09:00 Easterns Northea… 229 O 1 0
## 7 2019-04-01 00:09:00 Easterns Northea… 229 O 1 0
## 8 2019-04-01 00:09:00 Easterns Northea… 229 O 1 0
## 9 2019-04-01 00:09:00 Easterns Northea… 229 O 1 0
## 10 2019-04-01 00:09:00 Easterns Northea… 229 O 1 0
## # … with 4,817 more rows, and 12 more variables: `Event Type` <chr>,
## # Action <chr>, Passer <chr>, rec <chr>, Defender <chr>, p0 <chr>,
## # p1 <chr>, p2 <chr>, p3 <chr>, p4 <chr>, p5 <chr>, p6 <chr>
# Attribute every event row to a single player:
#   - no passer recorded -> the defender
#   - a drop             -> the receiver
#   - anything else      -> the passer
plusminus <- stats %>%
  mutate(
    Player = ifelse(
      is.na(Passer),
      Defender,
      ifelse(Action == "Drop", rec, Passer)
    )
  )

# Goal rows keep the attribution above; append a copy of each one as a
# "Score" row credited to the receiver, so the catching end of a goal is
# counted as well.
plusminus <- rbind(
  plusminus,
  plusminus %>%
    filter(Action == "Goal") %>%
    mutate(Action = "Score", Player = rec)
)
# Tally how often each (Player, Action) pair occurs. The result is left
# grouped by Player.
plusminus <- plusminus %>%
  count(Player, Action) %>%
  group_by(Player)

# Remove last 6 entries, not useful.
# NOTE(review): the 6 is tied to this data set and to the sort order of the
# tally -- presumably these are the rows without a Player; confirm before
# reusing this on other data.
rows <- nrow(plusminus)
plusminus <- plusminus[1:(rows - 6), ]
# Reshape to one row per player with one column per action, then drop the
# "0" and "Anonymous" entries. The table stays grouped by Player.
plusminus <- plusminus %>%
  group_by(Player) %>%
  spread(key = "Action", value = n) %>%
  filter(Player != "0", Player != "Anonymous")

# A player/action pair that never occurred is NA after the reshape; use 0.
plusminus[is.na(plusminus)] <- 0

# Add the plus_minus column: credited actions minus turnovers.
plusminus$plus_minus <- with(
  plusminus,
  Callahan + D + Goal + Score - Stall - Throwaway - Drop
)
plusminus
## # A tibble: 31 x 12
## # Groups: Player [31]
## Player Callahan Catch D Drop Goal Pull PullOb Score Stall
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Aaron 0 8 1 4 0 0 0 8 0
## 2 Adam W 0 410 4 5 20 1 1 4 0
## 3 Ari 0 69 6 8 11 0 0 27 0
## 4 Austin 0 16 6 2 1 0 0 3 0
## 5 Baugh 0 17 15 0 0 0 0 14 0
## 6 Berg 0 90 3 2 7 0 0 2 0
## 7 Boots 0 173 31 8 58 31 14 63 0
## 8 Brand… 0 4 3 2 0 0 0 0 0
## 9 Colin 0 45 5 3 0 0 0 8 0
## 10 DFB 0 384 5 11 18 32 5 5 1
## # … with 21 more rows, and 2 more variables: Throwaway <dbl>,
## # plus_minus <dbl>
# Passing percentage: completions plus assists as a percentage of
# completions + assists + throwaways + stalls.
passing <- select(
  plusminus,
  Player,
  Completions = "Catch",
  Assists = "Goal",
  "Throwaway",
  "Stall"
)
passing$pass_perc <- with(
  passing,
  (Completions + Assists) / (Completions + Assists + Throwaway + Stall) * 100
)
passing
## # A tibble: 31 x 6
## # Groups: Player [31]
## Player Completions Assists Throwaway Stall pass_perc
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Aaron 8 0 0 0 100
## 2 Adam W 410 20 20 0 95.6
## 3 Ari 69 11 7 0 92.0
## 4 Austin 16 1 7 0 70.8
## 5 Baugh 17 0 0 0 100
## 6 Berg 90 7 5 0 95.1
## 7 Boots 173 58 29 0 88.8
## 8 Brandon 4 0 1 0 80
## 9 Colin 45 0 1 0 97.8
## 10 DFB 384 18 22 1 94.6
## # … with 21 more rows
# Collapse the event rows to one row per distinct point. A point is identified
# by tournament, opponent, the score at the end of the point and the seven
# players on the line; count() is used only to de-duplicate, and its tally is
# discarded by keeping just the player columns.
points <- stats %>%
  count(
    tournament, opponent, ourscore, theirscore,
    p0, p1, p2, p3, p4, p5, p6
  ) %>%
  select(p0, p1, p2, p3, p4, p5, p6)

# Number of points each name appears in, pooled over all seven line slots.
points_played <- as.data.frame(table(unlist(points)))
points_played
## Var1 Freq
## 1 Aaron 59
## 2 Adam W 177
## 3 Ari 213
## 4 Austin 63
## 5 Baugh 126
## 6 Berg 155
## 7 Boots 307
## 8 Brandon 29
## 9 Colin 134
## 10 DFB 223
## 11 Drew 11
## 12 Grant 219
## 13 Greenlee 30
## 14 Jack 63
## 15 Jeremy 186
## 16 Jimmy 63
## 17 Jip 35
## 18 Joel 193
## 19 Johnny 132
## 20 Luke 175
## 21 Mason 143
## 22 Matt Joy 52
## 23 Michael 15
## 24 Moose 131
## 25 Paul 30
## 26 Rudy 340
## 27 Ryan 60
## 28 Sheedy 42
## 29 Steve 171
## 30 Theo 46
## 31 Will 137
# Combine each player's plus/minus with the number of points they played.
#
# The two tables are matched on the player's name instead of by row position,
# so a difference in row order or in the set of players can no longer pair a
# player with somebody else's total. A player absent from points_played gets
# NA for points_played.
#
# plusminus already holds exactly one row per player, so it only needs to be
# ungrouped here; no per-player summarise is required.
plot1 <- plusminus %>%
  ungroup() %>%
  left_join(
    data.frame(
      Player = as.character(points_played$Var1),
      points_played = points_played$Freq,
      stringsAsFactors = FALSE
    ),
    by = "Player"
  ) %>%
  select(Player, plus_minus, points_played)

# Plus/minus earned per point played.
plot1$per_point <- plot1$plus_minus / plot1$points_played
# Points played against plus/minus per point, with a linear trend line.
# Rendered through plotly so that hovering a point shows the listed fields.
plot <- ggplot(
  data = plot1,
  mapping = aes(label = Player, x = per_point, y = points_played)
) +
  geom_point() +
  geom_smooth(method = lm)
ggplotly(plot, tooltip = c("Player", "per_point", "points_played"))
# Fit a loess curve of points played on plus/minus per point and evaluate it
# at every player's own per_point value.
loess_fit <- loess(formula = points_played ~ per_point, data = plot1)
plot1$prediction <- predict(loess_fit, newdata = plot1$per_point)

# Absolute gap between fitted and actual points played; hyp flags the players
# whose gap is below 50 points.
plot1$diff <- abs(plot1$prediction - plot1$points_played)
plot1$hyp <- plot1$diff < 50
plot1
## # A tibble: 31 x 7
## Player plus_minus points_played per_point prediction diff hyp
## <chr> <dbl> <int> <dbl> <dbl> <dbl> <lgl>
## 1 Aaron 5 59 0.0847 95.1 36.1 TRUE
## 2 Adam W 3 177 0.0169 82.3 94.7 FALSE
## 3 Ari 29 213 0.136 172. 40.5 TRUE
## 4 Austin 1 63 0.0159 82.8 19.8 TRUE
## 5 Baugh 29 126 0.230 120. 6.40 TRUE
## 6 Berg 5 155 0.0323 77.9 77.1 FALSE
## 7 Boots 115 307 0.375 301. 5.66 TRUE
## 8 Brandon 0 29 0 91.3 62.3 FALSE
## 9 Colin 9 134 0.0672 74.0 60.0 FALSE
## 10 DFB -6 223 -0.0269 110. 113. FALSE
## # … with 21 more rows
# One-sided test, using the normal approximation to a proportion, of whether
# the share of players flagged in plot1$hyp exceeds pa.
#
# n is taken from the data instead of being hard-coded, and flagged players
# are counted with sum(), which skips NA flags; indexing with `hyp == TRUE`
# would have kept NA entries and counted them as flagged.
n <- sum(!is.na(plot1$hyp))    # players with a usable flag
pa <- 0.5                      # proportion under the null hypothesis
ex <- pa                       # expected sample proportion under the null
var_x <- pa * (1 - pa) / n     # variance of the sample proportion
# Observed proportion of flagged players. The variable name is kept as in the
# rest of the script; calls to mean() still resolve to the base function.
mean <- sum(plot1$hyp, na.rm = TRUE) / n
std <- sqrt(var_x)
# Upper-tail probability of a proportion at least this large under the null.
p_value <- 1 - pnorm(mean, ex, std)
p_value
## [1] 0.8154143
# Points played against total plus/minus, with a linear trend line.
plot <- ggplot(
  data = plot1,
  mapping = aes(label = Player, x = plus_minus, y = points_played)
) +
  geom_point() +
  geom_smooth(method = lm)
ggplotly(plot, tooltip = c("Player", "plus_minus", "points_played"))
# Combine each player's passing percentage with the number of points they
# played.
#
# The tables are matched on the player's name instead of by row position, so
# a difference in row order or in the set of players cannot misattribute a
# total. A player absent from points_played gets NA for points_played.
#
# passing already holds one row per player; it is ungrouped explicitly so the
# result does not depend on grouping inherited from earlier steps.
pass_perc <- passing %>%
  ungroup() %>%
  left_join(
    data.frame(
      Player = as.character(points_played$Var1),
      points_played = points_played$Freq,
      stringsAsFactors = FALSE
    ),
    by = "Player"
  ) %>%
  select(Player, pass_perc, points_played)
pass_perc
## # A tibble: 31 x 3
## Player pass_perc points_played
## <chr> <dbl> <int>
## 1 Aaron 100 59
## 2 Adam W 95.6 177
## 3 Ari 92.0 213
## 4 Austin 70.8 63
## 5 Baugh 100 126
## 6 Berg 95.1 155
## 7 Boots 88.8 307
## 8 Brandon 80 29
## 9 Colin 97.8 134
## 10 DFB 94.6 223
## # … with 21 more rows
# Points played against passing percentage, with a linear trend line; the
# hover text is restricted to the player.
plot <- ggplot(
  data = pass_perc,
  mapping = aes(label = Player, x = pass_perc, y = points_played)
) +
  geom_point() +
  geom_smooth(method = lm)
ggplotly(plot, tooltip = c("Player"))